{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__title__      = ML in Action Chapter9 Code（regTrees）\n",
    "__author__     = wgj\n",
    "__createDate__ = 2018-11-05 20:30:28"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numpy import *\n",
    "\n",
    "def loadDataSet(fileName): \n",
    "    dataMat = []                           #  最后1列为目标值\n",
    "    fr = open(fileName)\n",
    "    for line in fr.readlines():\n",
    "        curLine = line.strip().split('\\t')         # 移除每行首尾空格并用tab分割数据\n",
    "        fltLine = list(map(float,curLine))     # 将每行映射成浮点数(Python3 需在map前加list)\n",
    "        dataMat.append(fltLine)\n",
    "    return dataMat\n",
    "\n",
    "\"\"\" 对数据集进行2向切分 \"\"\"\n",
    "def binSplitDataSet(dataSet, feature, value):\n",
    "    \"\"\" \n",
    "    输入：1、dataSet——数据集（m行xn列）\n",
    "                2、feature——选择用来切分的特征（列索引）\n",
    "                3、value——特征值\n",
    "    输出：1、mat0——数据集在特征feature下 > value的分支\n",
    "                2、mat1——数据集在特征feature下<= value的分支\n",
    "    \"\"\"\n",
    "    mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0],:]\n",
    "    mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0],:]\n",
    "#     mat0 = dataSet[dataSet[:,feature] > value,:]        # 仅能用于array\n",
    "#     mat1 = dataSet[dataSet[:,feature] <= value,:]\n",
    "    return mat0,mat1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\" 计算目标变量均值 \"\"\"\n",
    "def regLeaf(dataSet):#returns the value used for each leaf\n",
    "    return mean(dataSet[:,-1])\n",
    "\n",
    "\"\"\" 计算目标变量总方差 \"\"\"\n",
    "def regErr(dataSet):\n",
    "    return var(dataSet[:,-1]) * shape(dataSet)[0]\n",
    "\n",
    "\"\"\" 选择最佳切分特征 \"\"\"\n",
    "def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):\n",
    "    \"\"\" \n",
    "    输入：1、dataSet——数据集（m行xn列）\n",
    "                2、leafType——建立叶节点函数的引用\n",
    "                3、errType——误差计算函数的引用\n",
    "                4、ops——其他参数（元组）\n",
    "    输出：1、bestIndex——最佳用来切分的特征（列索引）\n",
    "                2、bestValue——特征值\n",
    "    \"\"\"\n",
    "    tolS = ops[0];    # 容许的误差下降值\n",
    "    tolN = ops[1]    # 切分出的最小样本数\n",
    "    \n",
    "    # 如果剩余1个特征则退出（1）\n",
    "    if len(set(dataSet[:,-1].T.tolist()[0])) == 1: \n",
    "        return None, leafType(dataSet)\n",
    "    m,n = shape(dataSet)\n",
    "    #the choice of the best feature is driven by Reduction in RSS error from mean\n",
    "    S = errType(dataSet)                                            # 当前数据集的误差\n",
    "    bestS = inf; bestIndex = 0; bestValue = 0         # 初始化最小误差、最佳特征、最佳特征值\n",
    "    for featIndex in range(n-1):                               # 遍历所有特征\n",
    "        for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]):        # 遍历该特征的所有值\n",
    "            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)                 # 进行切分\n",
    "            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue      # 跳过样本数<阈值的切分\n",
    "            newS = errType(mat0) + errType(mat1)                                                 # 计算当前切分下的两组样本集的误差和\n",
    "            if newS < bestS:                                            # 保留误差较小的分割结果\n",
    "                bestIndex = featIndex\n",
    "                bestValue = splitVal\n",
    "                bestS = newS\n",
    "                # 最佳切分即使得切分后能达到最低误差的切分\n",
    "\n",
    "    # 误差减小量小于阈值则退出（2）\n",
    "    if (S - bestS) < tolS: \n",
    "        return None, leafType(dataSet) \n",
    "    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)\n",
    "    \n",
    "    # 如果切分出的数据集小于阈值则退出（3）\n",
    "    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): \n",
    "        return None, leafType(dataSet)\n",
    "    return bestIndex,bestValue\n",
    "\n",
    "\"\"\" 构建树 \"\"\"\n",
    "def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):#数据集为NumPy Mat可进行索引\n",
    "    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)  # 选择最佳切分\n",
    "    if feat == None: return val #if the splitting hit a stop condition return val\n",
    "    retTree = {}                            # 初始化树为空字典\n",
    "    retTree['spInd'] = feat          # 树节点所对应的特征\n",
    "    retTree['spVal'] = val             #  树节点所对应的特征值\n",
    "    lSet, rSet = binSplitDataSet(dataSet, feat, val)                        # 按照选择的特征进行数据集切分\n",
    "    retTree['left'] = createTree(lSet, leafType, errType, ops)        # 构建左子树\n",
    "    retTree['right'] = createTree(rSet, leafType, errType, ops)     # 构建右子树\n",
    "    return retTree"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "测试回归树构建函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'spInd': 1,\n",
       " 'spVal': 0.39435,\n",
       " 'left': {'spInd': 1,\n",
       "  'spVal': 0.582002,\n",
       "  'left': {'spInd': 1,\n",
       "   'spVal': 0.797583,\n",
       "   'left': 3.9871632,\n",
       "   'right': 2.9836209534883724},\n",
       "  'right': 1.980035071428571},\n",
       " 'right': {'spInd': 1,\n",
       "  'spVal': 0.197834,\n",
       "  'left': 1.0289583666666666,\n",
       "  'right': -0.023838155555555553}}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dat1 = loadDataSet('ex0.txt')\n",
    "dat1 = mat(dat1)\n",
    "createTree(dat1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "模型树相关函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def linearSolve(dataSet):   #helper function used in two places\n",
    "    m,n = shape(dataSet)\n",
    "    X = mat(ones((m,n))); Y = mat(ones((m,1)))#create a copy of data with 1 in 0th postion\n",
    "    X[:,1:n] = dataSet[:,0:n-1]; Y = dataSet[:,-1]#and strip out Y\n",
    "    xTx = X.T*X\n",
    "    if linalg.det(xTx) == 0.0:\n",
    "        raise NameError('This matrix is singular, cannot do inverse,\\n\\\n",
    "        try increasing the second value of ops')\n",
    "    ws = xTx.I * (X.T * Y)\n",
    "    return ws,X,Y\n",
    "\n",
    "def modelLeaf(dataSet):#create linear model and return coeficients\n",
    "    ws,X,Y = linearSolve(dataSet)\n",
    "    return ws\n",
    "\n",
    "def modelErr(dataSet):\n",
    "    ws,X,Y = linearSolve(dataSet)\n",
    "    yHat = X * ws\n",
    "    return sum(power(Y - yHat,2))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "树回归与标准回归的比较"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def isTree(obj):\n",
    "    return (type(obj).__name__=='dict')\n",
    "\n",
    "\"\"\" 对回归树叶节点进行预测 \"\"\"\n",
    "def regTreeEval(model, inDat):\n",
    "    return float(model)                # 回归树的叶节点即为均值\n",
    "\n",
    "\"\"\" 对模型树叶节点进行预测 \"\"\"\n",
    "def modelTreeEval(model, inDat):\n",
    "    n = shape(inDat)[1]\n",
    "    X = mat(ones((1,n+1)))\n",
    "    X[:,1:n+1]=inDat              # 与上1行的共同作用是对原始数据集前增加1个1\n",
    "    return float(X*model)   # X与系数W的乘积即为预测值\n",
    "\n",
    "\"\"\" 自顶向下遍历树，返回叶节点预测值 \"\"\"\n",
    "def treeForeCast(tree, inData, modelEval=regTreeEval):\n",
    "    \"\"\" \n",
    "    输入：1、tree——树结构\n",
    "                2、inData——输入数据（单点或行向量）\n",
    "                3、modelEval——对叶节点数据进行预测函数的引用\n",
    "    输出：叶节点预测值\n",
    "    \"\"\"\n",
    "    if not isTree(tree): return modelEval(tree, inData)    # 叶子节点直接返回预测值\n",
    "    if inData[tree['spInd']] > tree['spVal']:\n",
    "        if isTree(tree['left']): return treeForeCast(tree['left'], inData, modelEval)\n",
    "        else: return modelEval(tree['left'], inData)\n",
    "    else:\n",
    "        if isTree(tree['right']): return treeForeCast(tree['right'], inData, modelEval)\n",
    "        else: return modelEval(tree['right'], inData)\n",
    "\n",
    "\"\"\" 对一组测试数据集计算预测值 \"\"\"\n",
    "def createForeCast(tree, testData, modelEval=regTreeEval):\n",
    "    \"\"\" \n",
    "    输入：1、tree——树结构\n",
    "                2、testData——测试数据集\n",
    "                3、modelEval——对叶节点数据进行预测函数的引用\n",
    "    输出：yHat——一组预测值（行向量）\n",
    "    \"\"\"\n",
    "    m = len(testData)\n",
    "    yHat = mat(zeros((m,1)))\n",
    "    for i in range(m):\n",
    "        yHat[i,0] = treeForeCast(tree, mat(testData[i]), modelEval)\n",
    "    return yHat"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "进行比较"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9640852318222145"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\" 创建1颗回归树 \"\"\"\n",
    "trainMat = mat(loadDataSet('bikeSpeedVsIq_train.txt'))\n",
    "testMat = mat(loadDataSet('bikeSpeedVsIq_test.txt'))\n",
    "myTree = createTree(trainMat, ops=(1,20))\n",
    "yHat = createForeCast(myTree, testMat[:,0])\n",
    "corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9760412191380623"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\" 创建1颗模型树 \"\"\"\n",
    "myTree = createTree(trainMat, modelLeaf, modelErr,ops=(1,20))\n",
    "yHat = createForeCast(myTree, testMat[:,0], modelTreeEval)\n",
    "corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9434684235674762"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\" 标准线性回归 \"\"\"\n",
    "ws, X, Y = linearSolve(trainMat)\n",
    "for i in range(shape(testMat)[0]):\n",
    "    yHat[i] = testMat[i,0]*ws[1,0]+ws[0,0]\n",
    "corrcoef(yHat, testMat[:,1], rowvar=0)[0,1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "由上述结果可知，在相关系数度量下，三种方法优略如下：\n",
    "标准线性回归<回归树<模型树"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
